PCA

Code
# Load the data set from CSV.
df <- read.csv(file = "data/wisconsin_data.csv")
# Keep only the explanatory variables used for the PCA;
# columns 1 and 2 hold the patient ID and the diagnosis, respectively.
feature <- df[, seq(3, 12)]
head(feature, n = 6L)
  radius_mean texture_mean perimeter_mean area_mean smoothness_mean
1       17.99        10.38         122.80    1001.0         0.11840
2       20.57        17.77         132.90    1326.0         0.08474
3       19.69        21.25         130.00    1203.0         0.10960
4       11.42        20.38          77.58     386.1         0.14250
5       20.29        14.34         135.10    1297.0         0.10030
6       12.45        15.70          82.57     477.1         0.12780
  compactness_mean concavity_mean concave.points_mean symmetry_mean
1          0.27760         0.3001             0.14710        0.2419
2          0.07864         0.0869             0.07017        0.1812
3          0.15990         0.1974             0.12790        0.2069
4          0.28390         0.2414             0.10520        0.2597
5          0.13280         0.1980             0.10430        0.1809
6          0.17000         0.1578             0.08089        0.2087
  fractal_dimension_mean
1                0.07871
2                0.05667
3                0.05999
4                0.09744
5                0.05883
6                0.07613

주성분 분석

Code
# Run PCA on the standardized features: with `scale. = TRUE` each variable is
# mean-centered (the default) and divided by its standard deviation.
# The argument name is spelled out in full (`scale.`, not `scale`) so the call
# does not rely on partial argument matching, and `TRUE` replaces `T`, which
# is an ordinary variable that can be reassigned.
pca <- prcomp(feature, scale. = TRUE)
pca
Standard deviations (1, .., p=10):
 [1] 2.34063837 1.58704555 0.93841099 0.70640600 0.61035989 0.35233755
 [7] 0.28299348 0.18678810 0.10552469 0.01680196

Rotation (n x k) = (10 x 10):
                               PC1          PC2         PC3          PC4
radius_mean            -0.36393793  0.313929073 -0.12442759  0.029558858
texture_mean           -0.15445113  0.147180909  0.95105659  0.008916084
perimeter_mean         -0.37604434  0.284657885 -0.11408360  0.013458069
area_mean              -0.36408585  0.304841714 -0.12337786  0.013442682
smoothness_mean        -0.23248053 -0.401962324 -0.16653247 -0.107802033
compactness_mean       -0.36444206 -0.266013147  0.05827786 -0.185700413
concavity_mean         -0.39574849 -0.104285968  0.04114649 -0.166653523
concave.points_mean    -0.41803840 -0.007183605 -0.06855383 -0.072983951
symmetry_mean          -0.21523797 -0.368300910  0.03672364  0.892998475
fractal_dimension_mean -0.07183744 -0.571767700  0.11358395 -0.349331790
                                PC5          PC6         PC7          PC8
radius_mean            -0.031067022  0.264180150 -0.04418839  0.084834062
texture_mean           -0.219922761  0.032206572  0.02055748 -0.007126797
perimeter_mean         -0.005945081  0.237819464 -0.08336923  0.089258879
area_mean              -0.019341222  0.331707454  0.26118796  0.144609749
smoothness_mean        -0.843745292 -0.062225368  0.01129197  0.170503128
compactness_mean        0.240182967 -0.005271104 -0.80380484  0.063980134
concavity_mean          0.312533244 -0.601467155  0.36713629  0.449573315
concave.points_mean    -0.009180198 -0.265613395  0.14131308 -0.850918762
symmetry_mean           0.112888068  0.061957003  0.04790201  0.016455606
fractal_dimension_mean  0.264878077  0.567918997  0.34521359 -0.065259461
                                PC9          PC10
radius_mean             0.474425305 -0.6690714888
texture_mean            0.004212629  0.0002497826
perimeter_mean          0.380167210  0.7404905337
area_mean              -0.747347357 -0.0323589585
smoothness_mean         0.005847386  0.0036904058
compactness_mean       -0.218732407 -0.0527527802
concavity_mean          0.081170670 -0.0103668020
concave.points_mean    -0.022024652 -0.0037475480
symmetry_mean           0.009067850  0.0014669472
fractal_dimension_mean  0.129667491  0.0070573477

주성분 분석의 결과

Code
# Inspect the fitted PCA object: lists the names of its components and its class.
attributes(pca)
$names
[1] "sdev"     "rotation" "center"   "scale"    "x"       

$class
[1] "prcomp"

주성분 변환 계수(loading vectors)

Code
# Loading (rotation) matrix: one row per input variable, one column per principal component.
pca$rotation
                               PC1          PC2         PC3          PC4
radius_mean            -0.36393793  0.313929073 -0.12442759  0.029558858
texture_mean           -0.15445113  0.147180909  0.95105659  0.008916084
perimeter_mean         -0.37604434  0.284657885 -0.11408360  0.013458069
area_mean              -0.36408585  0.304841714 -0.12337786  0.013442682
smoothness_mean        -0.23248053 -0.401962324 -0.16653247 -0.107802033
compactness_mean       -0.36444206 -0.266013147  0.05827786 -0.185700413
concavity_mean         -0.39574849 -0.104285968  0.04114649 -0.166653523
concave.points_mean    -0.41803840 -0.007183605 -0.06855383 -0.072983951
symmetry_mean          -0.21523797 -0.368300910  0.03672364  0.892998475
fractal_dimension_mean -0.07183744 -0.571767700  0.11358395 -0.349331790
                                PC5          PC6         PC7          PC8
radius_mean            -0.031067022  0.264180150 -0.04418839  0.084834062
texture_mean           -0.219922761  0.032206572  0.02055748 -0.007126797
perimeter_mean         -0.005945081  0.237819464 -0.08336923  0.089258879
area_mean              -0.019341222  0.331707454  0.26118796  0.144609749
smoothness_mean        -0.843745292 -0.062225368  0.01129197  0.170503128
compactness_mean        0.240182967 -0.005271104 -0.80380484  0.063980134
concavity_mean          0.312533244 -0.601467155  0.36713629  0.449573315
concave.points_mean    -0.009180198 -0.265613395  0.14131308 -0.850918762
symmetry_mean           0.112888068  0.061957003  0.04790201  0.016455606
fractal_dimension_mean  0.264878077  0.567918997  0.34521359 -0.065259461
                                PC9          PC10
radius_mean             0.474425305 -0.6690714888
texture_mean            0.004212629  0.0002497826
perimeter_mean          0.380167210  0.7404905337
area_mean              -0.747347357 -0.0323589585
smoothness_mean         0.005847386  0.0036904058
compactness_mean       -0.218732407 -0.0527527802
concavity_mean          0.081170670 -0.0103668020
concave.points_mean    -0.022024652 -0.0037475480
symmetry_mean           0.009067850  0.0014669472
fractal_dimension_mean  0.129667491  0.0070573477

주성분 변환 값(scores)

Code
# First rows of the score matrix: each observation's coordinates on the principal components.
head(pca$x)
           PC1        PC2        PC3        PC4        PC5         PC6
[1,] -5.219562 -3.2016111 -2.1694307 -0.1691271  1.5129208 -0.11302355
[2,] -1.726575  2.5386054 -1.0187821  0.5470581  0.3120551  0.93481161
[3,] -3.966267  0.5495913 -0.3232843  0.3976143 -0.3225932 -0.27125473
[4,] -3.593551 -6.8989994  0.7921346 -0.6042963  0.2429625  0.61642725
[5,] -3.148321  1.3568784 -1.8605969 -0.1850886  0.3110679 -0.09069778
[6,] -1.380105 -3.3114977 -0.6973879 -0.4723685 -0.5006185 -0.16262179
             PC7          PC8         PC9         PC10
[1,] -0.34438133 -0.231727880  0.02196273 -0.011247764
[2,]  0.42055208 -0.008335534  0.05612189 -0.022971998
[3,]  0.07643917 -0.354737945 -0.02009818 -0.022654878
[4,] -0.06799091 -0.100074939  0.04344302 -0.053409301
[5,]  0.30781641  0.098969999  0.02655061  0.034082655
[6,]  0.06429129 -0.099994918  0.04840093 -0.006434524

Scree plot (I)

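A scree plot visualizes how much of the total variation each principal component explains. Note that `plot(pca)` displays the variance (eigenvalue) of each component rather than a percentage.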

Code
# Base-graphics scree plot: the plot method for prcomp objects draws the component variances.
plot(pca) 

Scree plot (II)

Code
library(ggplot2)

# Proportion of the total variance explained by each principal component
# (the squared standard deviations are the component variances).
var_explained <- pca$sdev^2 / sum(pca$sdev^2)
# Number the components from the data itself rather than a hard-coded 1:10,
# so data.frame() cannot silently recycle values if the component count changes.
var_explained_df <- data.frame(
  num = seq_along(var_explained),
  var = var_explained
)
ggplot(var_explained_df, aes(num, var)) +
  geom_point() +
  geom_line() +
  # Component numbers are integers: place one axis break on each component.
  scale_x_continuous(breaks = var_explained_df$num) +
  xlab("Principal Component") +
  ylab("Variance Explained") +
  ggtitle("Scree Plot") +
  ylim(0, 1)

주성분값 추출

Code
# Score matrix: the principal-component values (PC1, PC2, ..., PC10) of each patient.
pc <- pca$x
# Keep only the first two components.
pc12 <- pc[, c("PC1", "PC2")]
# Pair each patient's diagnosis with its PC1/PC2 scores.
df12 <- data.frame(diagnosis = df[["diagnosis"]], pc12)
head(df12, n = 6L)
  diagnosis       PC1        PC2
1         M -5.219562 -3.2016111
2         M -1.726575  2.5386054
3         M -3.966267  0.5495913
4         M -3.593551 -6.8989994
5         M -3.148321  1.3568784
6         M -1.380105 -3.3114977

시각화

Code
library(ggplot2)
# Scatter plot of the first two principal components, coloured by diagnosis.
ggplot(
  data = df12,
  mapping = aes(x = PC1, y = PC2, colour = diagnosis)
) +
  geom_point()

Biplot (I)

Biplot = PCA score plot + loading plot

- Data points: the PCA scores of the observations.
- Arrows: the loadings of the variables.

Code
# Base-graphics biplot of the PCA result: observation scores and variable loadings in one figure.
biplot(pca)

Biplot (II)

Code
library(factoextra)
# Biplot via factoextra: label only the variables and
# colour the observations by their diagnosis.
fviz_pca_biplot(
  pca,
  label = "var",
  habillage = df[["diagnosis"]]
)